Overview

The goals of this workshop include: (1) learning about data wrangling principles (e.g., tidy data); (2) importing, pre-processing, and visualizing data using R; and (3) sharing recommended readings and activities for continued learning.

Click Here for Workshop Slides

Intro to R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

Note that echo = FALSE, when added to the code chunk, prevents printing of the R code that generated the output.

Load libs/packages

library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ dplyr   1.0.2
## ✓ tibble  3.0.4     ✓ stringr 1.4.0
## ✓ tidyr   1.1.2     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Read in data

“These Community Mobility Reports aim to provide insights into what has changed in response to policies aimed at combating COVID-19. The reports chart movement trends over time by geography, across different categories of places such as retail and recreation, groceries and pharmacies, parks, transit stations, workplaces, and residential.”

Learn More

# Read in the Google Mobility dataset.
# Pin the sparse columns to character: readr guesses column types from the
# first rows, which are all NA for sub_region_2 / metro_area /
# census_fips_code, so they are mis-inferred as col_logical() and every
# later text value becomes a parsing failure (7.3M of them in the log
# below). Character also preserves leading zeros in FIPS codes.
google_mobility <- read_csv(
  "https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv",
  col_types = cols(
    sub_region_2 = col_character(),
    metro_area = col_character(),
    census_fips_code = col_character()
  )
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   country_region_code = col_character(),
##   country_region = col_character(),
##   sub_region_1 = col_character(),
##   sub_region_2 = col_logical(),
##   metro_area = col_logical(),
##   iso_3166_2_code = col_character(),
##   census_fips_code = col_logical(),
##   place_id = col_character(),
##   date = col_date(format = ""),
##   retail_and_recreation_percent_change_from_baseline = col_double(),
##   grocery_and_pharmacy_percent_change_from_baseline = col_double(),
##   parks_percent_change_from_baseline = col_double(),
##   transit_stations_percent_change_from_baseline = col_double(),
##   workplaces_percent_change_from_baseline = col_double(),
##   residential_percent_change_from_baseline = col_double()
## )
## Warning: 7330820 parsing failures.
##  row        col           expected                  actual                                                                  file
## 5196 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
## 5197 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
## 5198 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
## 5199 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
## 5200 metro_area 1/0/T/F/TRUE/FALSE Kabul Metropolitan Area 'https://www.gstatic.com/covid19/mobility/Global_Mobility_Report.csv'
## .... .......... .................. ....................... .....................................................................
## See problems(...) for more details.
#google_mobility_byregion = "https://www.gstatic.com/covid19/mobility/Region_Mobility_Report_CSVs.zip"

Create lookup tables

R, as well as some packages, have built in datasets. They can be called with the data() function (e.g., data(state)). We will leverage a built in dataset with state metadata.

# Build a state-level lookup table from R's built-in `state` datasets
# (datasets::state.name and friends), keyed by sub_region_1 so it can be
# joined onto the Google Mobility data. Use `<-` for assignment per the
# tidyverse style guide.
data(state)
state_lookup <- tibble(sub_region_1 = state.name,     # full state name
                       us_division = state.division,  # census division (factor)
                       us_region = state.region,      # census region (factor)
                       state_land_area = state.area)  # land area (sq. miles)

Pre-process data

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# filter for US
# Filter to US rows and derive calendar features from the date column.
# Use TRUE (never the reassignable shorthand T) and note that `%in%`
# already returns a logical, so no ifelse() wrapper is needed.
google_mobility_US <- google_mobility %>%
  filter(country_region == "United States") %>%
  mutate(dt_date = date) %>%
  mutate(dt_month = lubridate::month(dt_date),
         dt_month_ = lubridate::month(dt_date, label = TRUE),  # ordered factor: Jan, Feb, ...
         dt_year = lubridate::year(dt_date),
         dt_day = lubridate::day(dt_date),
         dt_wkday_l = lubridate::wday(dt_date, label = TRUE),  # ordered factor: Sun, Mon, ...
         dt_wkday_n = lubridate::wday(dt_date)) %>%
  mutate(dt_weekend = dt_wkday_l %in% c("Sat", "Sun"))

Joining datasets

# Attach state metadata. Name the join key explicitly rather than relying
# on dplyr's "Joining, by = ..." guess; the inner join also drops rows with
# no sub_region_1 match in the lookup (e.g. nation-level records).
google_mobility_US_w_lu <- google_mobility_US %>%
  inner_join(state_lookup, by = "sub_region_1")
## Joining, by = "sub_region_1"

Quick descriptives

#skimr::skim(google_mobility_US)

Filter data

# filter with specific entries
# Each call keeps only rows whose sub_region_1 exactly matches one state
# name; `==` is appropriate here because a single scalar value is compared.
google_mobility_US_NY <- google_mobility_US %>%
  filter(sub_region_1 == "New York")

google_mobility_US_FL <- google_mobility_US %>%
  filter(sub_region_1 == "Florida")

# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# filter to a 'vector' of entries
# `%in%` tests membership against several values at once, so one filter()
# call replaces three separate `==` comparisons.
google_mobility_US_targeted <- google_mobility_US %>%
  filter(sub_region_1 %in% c("California", "New York", "Florida"))

Compute aggregates

# base R approach
# table() tabulates record counts per state and prints directly; compare
# with the dplyr group_by()/summarise() version below.
table(google_mobility_US$sub_region_1)
## 
##              Alabama               Alaska              Arizona 
##                36832                 5860                 9131 
##             Arkansas           California             Colorado 
##                36147                32532                25976 
##          Connecticut             Delaware District of Columbia 
##                 5247                 2332                  583 
##              Florida              Georgia               Hawaii 
##                38309                78125                 2915 
##                Idaho             Illinois              Indiana 
##                18822                51305                51353 
##                 Iowa               Kansas             Kentucky 
##                47985                31176                53927 
##            Louisiana                Maine             Maryland 
##                33189                 9651                14420 
##        Massachusetts             Michigan            Minnesota 
##                 8439                44308                42804 
##          Mississippi             Missouri              Montana 
##                39079                53834                15249 
##             Nebraska               Nevada        New Hampshire 
##                25598                 7253                 6388 
##           New Jersey           New Mexico             New York 
##                12818                16626                35884 
##       North Carolina         North Dakota                 Ohio 
##                55228                10965                50615 
##             Oklahoma               Oregon         Pennsylvania 
##                35647                19260                37870 
##         Rhode Island       South Carolina         South Dakota 
##                 3489                26518                14735 
##            Tennessee                Texas                 Utah 
##                50368               110216                13291 
##              Vermont             Virginia           Washington 
##                 7904                69502                20495 
##        West Virginia            Wisconsin              Wyoming 
##                24228                38884                11003
# Number of records by state, via dplyr. `.groups = "drop"` makes the
# result explicitly ungrouped and silences the summarise() regrouping
# message; `<-` replaces `=` for top-level assignment.
records_per_state <- google_mobility_US %>%
  group_by(sub_region_1) %>%
  summarise(n_records = n(), .groups = "drop") %>%
  arrange(sub_region_1)
## `summarise()` ungrouping output (override with `.groups` argument)
# print the per-state record counts
records_per_state
## # A tibble: 52 x 2
##    sub_region_1         n_records
##    <chr>                    <int>
##  1 Alabama                  36832
##  2 Alaska                    5860
##  3 Arizona                   9131
##  4 Arkansas                 36147
##  5 California               32532
##  6 Colorado                 25976
##  7 Connecticut               5247
##  8 Delaware                  2332
##  9 District of Columbia       583
## 10 Florida                  38309
## # … with 42 more rows
# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# Average percent-change metrics by state x year x month x weekday, then
# attach state metadata. na.rm = TRUE (never the shorthand T) skips the
# many missing baseline values; `.groups = "drop"` silences the
# summarise() regrouping message; the join key is named explicitly.
metrics_month_weekend_US <- google_mobility_US %>%
  group_by(sub_region_1, dt_year, dt_month_, dt_wkday_l) %>%
  summarise(avg_change_parks = mean(parks_percent_change_from_baseline, na.rm = TRUE),
            avg_change_retail_rec = mean(retail_and_recreation_percent_change_from_baseline, na.rm = TRUE),
            avg_change_workplaces = mean(workplaces_percent_change_from_baseline, na.rm = TRUE),
            .groups = "drop") %>%
  inner_join(state_lookup, by = "sub_region_1")
## `summarise()` regrouping output by 'sub_region_1', 'dt_year', 'dt_month_' (override with `.groups` argument)
## Joining, by = "sub_region_1"

Visualize the data

# Boxplots of average retail/recreation change per state (South & West
# regions), faceted by year. The x aesthetic is sub_region_1 (the state),
# so the axis label is "State" — the original said "Month".
allUS_retail <- ggplot(metrics_month_weekend_US %>% filter(us_region %in% c("South", "West")),
                       aes(sub_region_1, avg_change_retail_rec)) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90)) +
  facet_wrap(dt_year ~ .) +
  labs(x = "State", y = "Retail (% Change)")

# Same boxplot layout for parks. As above, x is the state, so the axis
# label is "State", not "Month"; `<-` replaces `=`.
allUS_parks <- ggplot(metrics_month_weekend_US %>% filter(us_region %in% c("South", "West")),
                      aes(sub_region_1, avg_change_parks)) +
  geom_boxplot() +
  theme_minimal() +
  facet_wrap(dt_year ~ .) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(x = "State", y = "Parks (% Change)")

# render the two state-level boxplot panels
allUS_retail

allUS_parks

# Jittered points of average retail & recreation change by month,
# grouped and coloured by US census region.
jitter_retail <- metrics_month_weekend_US %>%
  ggplot(aes(x = dt_month_,
             y = avg_change_retail_rec,
             group = us_region,
             color = us_region)) +
  geom_jitter() +
  theme_minimal() +
  labs(x = "Month", y = "Retail & Recreation (% Change)")

# Smoothed trend of average retail & recreation change by month, one curve
# per census region. `<-` replaces `=` for assignment; the commented-out
# alternate geoms are dropped as dead code.
smooth_retail <- ggplot(metrics_month_weekend_US, aes(dt_month_,
                                     avg_change_retail_rec,
                                     group = us_region,
                                     color = us_region)) +
  geom_smooth() +
  theme_minimal() +
  labs(x = "Month", y = "Retail & Recreation (% Change)")

# Smoothed trend of average parks change by month, one curve per census
# region. `<-` replaces `=`; dead commented-out geoms removed.
smooth_parks <- ggplot(metrics_month_weekend_US, aes(dt_month_,
                                     avg_change_parks,
                                     group = us_region,
                                     color = us_region)) +
  geom_smooth() +
  theme_minimal() +
  labs(x = "Month", y = "Parks (% Change)")

# render the jittered and smoothed retail plots
jitter_retail

smooth_retail
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# render the smoothed parks plot
smooth_parks
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Combining multiple plots

library(cowplot)

# Retail & recreation trend for the targeted states, Jan-May 2020, one
# facet per state. Fixes the "Retal" typo in the axis label and uses `<-`.
early_covid_retail <- ggplot(google_mobility_US_targeted %>%
         filter(dt_month %in% c(1, 2, 3, 4, 5)) %>%
         filter(dt_year == 2020), aes(dt_date,
                                    retail_and_recreation_percent_change_from_baseline,
                                    color = sub_region_1)) +
  geom_jitter() +
  theme_minimal() +
  facet_wrap(. ~ sub_region_1) +
  labs(x = "Month", y = "Retail & Rec (% Change)", title = "2020")

# Parks trend for the targeted states, Jan-May 2020, one facet per state.
# `<-` replaces `=` for assignment.
early_covid_parks <- ggplot(google_mobility_US_targeted %>%
         filter(dt_month %in% c(1, 2, 3, 4, 5)) %>%
         filter(dt_year == 2020), aes(dt_date,
                                    parks_percent_change_from_baseline,
                                    color = sub_region_1)) +
  geom_jitter() +
  theme_minimal() +
  facet_wrap(. ~ sub_region_1) +
  labs(x = "Month", y = "Parks (% Change)", title = "2020")

# Same retail & recreation layout for Jan-May 2021, to compare against the
# 2020 panel. Fixes the "Retal" typo and uses `<-`.
nxtyear_covid_retail <- ggplot(google_mobility_US_targeted %>%
         filter(dt_month %in% c(1, 2, 3, 4, 5)) %>%
         filter(dt_year == 2021), aes(dt_date,
                                    retail_and_recreation_percent_change_from_baseline,
                                    color = sub_region_1)) +
  geom_jitter() +
  theme_minimal() +
  facet_wrap(. ~ sub_region_1) +
  labs(x = "Month", y = "Retail & Rec (% Change)", title = "2021")
# stack the 2020 retail and parks panels vertically
cowplot::plot_grid(early_covid_retail, early_covid_parks, ncol = 1)
## Warning: Removed 2050 rows containing missing values (geom_point).
## Warning: Removed 7320 rows containing missing values (geom_point).

# compare 2020 vs 2021 retail trends, stacked vertically
cowplot::plot_grid(early_covid_retail, nxtyear_covid_retail, ncol=1)
## Warning: Removed 2050 rows containing missing values (geom_point).
## Warning: Removed 2915 rows containing missing values (geom_point).

# stack the state-level boxplots
cowplot::plot_grid(allUS_retail, allUS_parks, ncol=1)

# stack the smoothed retail and parks trends
cowplot::plot_grid(smooth_retail, smooth_parks, ncol=1)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Filter dataset for next plots

# Variable names -----
#   retail_and_recreation_percent_change_from_baseline = col_double(),
#   grocery_and_pharmacy_percent_change_from_baseline = col_double(),
#   parks_percent_change_from_baseline = col_double(),
#   transit_stations_percent_change_from_baseline = col_double(),
#   workplaces_percent_change_from_baseline = col_double(),
#   residential_percent_change_from_baseline = col_double()


early_covid_period_data = google_mobility_US_w_lu %>%
         filter(dt_month %in% c(1,2,3,4,5,6)) %>%
         filter(dt_year == 2020) %>%
         filter(sub_region_1 %in% c("New York", "California", "Florida", "Wyoming", "Texas", "Arizona","Montana", "South Dakota", "Nevada"))

Styling your plots (theme_*)

# Parks scatter with theme_void() (no axes, grid, or labels) and the
# legend suppressed — the minimal end of the theme spectrum.
ggplot(early_covid_period_data,
       aes(x = dt_date,
           y = parks_percent_change_from_baseline,
           color = sub_region_1)) +
  theme_void() +
  geom_jitter() +
  facet_wrap(~ sub_region_1) +
  theme(legend.position = "none")
## Warning: Removed 40493 rows containing missing values (geom_point).

# The same scatter with ggplot2's default (grey) theme, for comparison.
ggplot(early_covid_period_data,
       aes(x = dt_date,
           y = parks_percent_change_from_baseline,
           color = sub_region_1)) +
  geom_jitter() +
  facet_wrap(~ sub_region_1)
## Warning: Removed 40493 rows containing missing values (geom_point).

# The same scatter with theme_bw(): white background, black grid lines.
ggplot(early_covid_period_data,
       aes(x = dt_date,
           y = parks_percent_change_from_baseline,
           color = sub_region_1)) +
  theme_bw() +
  geom_jitter() +
  facet_wrap(~ sub_region_1)
## Warning: Removed 40493 rows containing missing values (geom_point).

# The same scatter with theme_dark(), a dark-background variant.
ggplot(early_covid_period_data,
       aes(x = dt_date,
           y = parks_percent_change_from_baseline,
           color = sub_region_1)) +
  theme_dark() +
  geom_jitter() +
  facet_wrap(~ sub_region_1)
## Warning: Removed 40493 rows containing missing values (geom_point).

Production plot

# Parks change over time, coloured by census region. The y aesthetic is
# parks_percent_change_from_baseline, so the axis label reads "Parks" —
# the original said "Retail & Recreation", contradicting the plotted data.
ggplot(early_covid_period_data, aes(dt_date,
                                    parks_percent_change_from_baseline,
                                    color = us_region)) +
  theme_bw() +
  geom_jitter() +
  facet_wrap(. ~ sub_region_1) +
  labs(x = "Date", y = "Parks (% Change)")
## Warning: Removed 40493 rows containing missing values (geom_point).

# Same parks plot with a larger base font and rotated x labels. Axis label
# corrected to "Parks" — the original said "Retail & Recreation" while
# plotting parks_percent_change_from_baseline.
ggplot(early_covid_period_data, aes(dt_date,
                                    parks_percent_change_from_baseline,
                                    color = us_region)) +
  geom_jitter() +
  facet_wrap(. ~ sub_region_1) +
  labs(x = "Date", y = "Parks (% Change)") +
  theme_bw(base_size = 16) +
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 40493 rows containing missing values (geom_point).

# Figure 1: smoothed retail & recreation trend per state over the early
# COVID period, coloured by census region, with full title/caption labels.
fig1 <- early_covid_period_data %>%
  ggplot(aes(x = dt_date,
             y = retail_and_recreation_percent_change_from_baseline,
             color = us_region)) +
  geom_smooth() +
  facet_wrap(~ sub_region_1) +
  labs(x = "Date", y = "Retail & Recreation (% Change)",
       tag = "Figure 1",
       title = "Google Mobility: Change over Time",
       subtitle = "i.e., Retail & Recreation, early COVID-19 period (01/20 - 06/20)",
       caption = "Google Mobility: Retail & Recreation (% Change) over early COVID period (01/20 - 06/20)") +
  theme_bw(base_size = 12, base_family = "Arial")

# Figure 2: smoothed parks trend per state. The y aesthetic is parks, so
# the axis label reads "Parks (% Change)" — the original said "Retail &
# Recreation", contradicting the figure's own subtitle and caption.
fig2 <- ggplot(early_covid_period_data, aes(dt_date,
                                    parks_percent_change_from_baseline,
                                    color = us_region)) +
  geom_smooth() +
  facet_wrap(. ~ sub_region_1) +
  labs(x = "Date", y = "Parks (% Change)",
       tag = "Figure 2",
       title = "Google Mobility: Change over Time",
       subtitle = "i.e., Parks, early COVID-19 period (01/20 - 06/20)",
       caption = "Google Mobility: Parks (% Change) over early COVID period (01/20 - 06/20)") +
  theme_bw(base_size = 12, base_family = "Arial")
# render Figure 1
fig1
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 16583 rows containing non-finite values (stat_smooth).

# render Figure 2
fig2
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Removed 40493 rows containing non-finite values (stat_smooth).

# Average parks change per state and year, in long format. na.rm = TRUE
# (never the shorthand T) skips missing values; `.groups = "drop"` returns
# an ungrouped tibble and silences the regrouping message.
largest_changes_parks_long <- google_mobility_US %>%
  group_by(sub_region_1, dt_year) %>%
  summarise(avg_change_parks = mean(parks_percent_change_from_baseline, na.rm = TRUE),
            .groups = "drop") %>%
  arrange(sub_region_1)
## `summarise()` regrouping output by 'sub_region_1' (override with `.groups` argument)
# Reshape to wide: one column per year, then compute the 2021 - 2020
# difference. Backticks are required because the year columns have
# numeric names. (Dead commented-out pivot_wider() call removed.)
largest_changes_parks_wide <- largest_changes_parks_long %>%
  pivot_wider(names_from = dt_year, values_from = avg_change_parks) %>%
  mutate(change_2021_2020 = `2021` - `2020`)

# Ten states with the largest 2020 -> 2021 increase. desc() is the
# idiomatic form of descending sort (clearer than negating the column, and
# it also works for non-numeric keys); arrange() ignores any grouping, so
# head() reliably takes the overall top 10 rows.
top10_changes <- largest_changes_parks_wide %>%
  arrange(desc(change_2021_2020)) %>%
  head(n = 10)

# Ten states with the largest decrease (most negative change): sort
# ascending and keep the first ten rows. head() is used deliberately — it
# takes the first rows of the whole table even if the input is grouped.
bottom10_changes <- head(arrange(largest_changes_parks_wide, change_2021_2020), 10)

# print the long-format state/year averages
largest_changes_parks_long
## # A tibble: 104 x 3
## # Groups:   sub_region_1 [52]
##    sub_region_1 dt_year avg_change_parks
##    <chr>          <dbl>            <dbl>
##  1 Alabama         2020            17.9 
##  2 Alabama         2021             8.85
##  3 Alaska          2020            52.9 
##  4 Alaska          2021            79.8 
##  5 Arizona         2020            -1.97
##  6 Arizona         2021            -1.91
##  7 Arkansas        2020            48.2 
##  8 Arkansas        2021            45.1 
##  9 California      2020             6.17
## 10 California      2021            12.5 
## # … with 94 more rows
# print the wide table with the computed year-over-year change
largest_changes_parks_wide
## # A tibble: 52 x 4
## # Groups:   sub_region_1 [52]
##    sub_region_1         `2020` `2021` change_2021_2020
##    <chr>                 <dbl>  <dbl>            <dbl>
##  1 Alabama               17.9    8.85          -9.04  
##  2 Alaska                52.9   79.8           26.9   
##  3 Arizona               -1.97  -1.91           0.0582
##  4 Arkansas              48.2   45.1           -3.07  
##  5 California             6.17  12.5            6.31  
##  6 Colorado              23.2   20.3           -2.95  
##  7 Connecticut           51.2   49.6           -1.63  
##  8 Delaware              46.8   62.7           15.9   
##  9 District of Columbia -34.8  -20.2           14.6   
## 10 Florida              -14.7   -8.17           6.57  
## # … with 42 more rows
# print the ten largest increases
top10_changes
## # A tibble: 10 x 4
## # Groups:   sub_region_1 [10]
##    sub_region_1 `2020` `2021` change_2021_2020
##    <chr>         <dbl>  <dbl>            <dbl>
##  1 Alaska         52.9   79.8             26.9
##  2 Rhode Island   69.2   90.5             21.3
##  3 Maine          59.0   80.2             21.2
##  4 South Dakota  106.   126.              20.4
##  5 Hawaii        -46.5  -28.8             17.7
##  6 Vermont        52.8   69.6             16.8
##  7 Montana        53.9   70.6             16.6
##  8 Wyoming        43.8   60.1             16.4
##  9 Delaware       46.8   62.7             15.9
## 10 Oregon         46.5   62.2             15.7
# print the ten largest decreases
bottom10_changes
## # A tibble: 10 x 4
## # Groups:   sub_region_1 [10]
##    sub_region_1  `2020` `2021` change_2021_2020
##    <chr>          <dbl>  <dbl>            <dbl>
##  1 Mississippi     4.92  -6.71           -11.6 
##  2 Kansas         67.3   57.5             -9.81
##  3 Alabama        17.9    8.85            -9.04
##  4 West Virginia   3.08  -4.78            -7.86
##  5 Massachusetts  59.3   51.4             -7.84
##  6 Louisiana      -8.68 -16.4             -7.68
##  7 Oklahoma       10.4    3.33            -7.07
##  8 Utah           61.5   56.1             -5.45
##  9 Nebraska       64.3   59.6             -4.70
## 10 Indiana        46.3   42.4             -3.88